Lets load the libraries we are going to use in this practice section

library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Lets load the file with experimental results from TnSeq experiment

The path to the file is different for you. Please make sure to specify a correct path. Let Alena know if you have any problems with that.

# Note: tbl_df function forwards the argument to "as_data_frame""

exp_data <- read.csv("Experimental_results.csv")
head(exp_data)
##   X BC_ID       H_0      H_24      H_48      H_72      H_96 Mut_ID Strain
## 1 1     1 -5.430079 -6.142158        NA -6.159939        NA    185 DivAnc
## 2 2    10 -5.731109        NA        NA        NA        NA    655 DivAnc
## 3 3    16 -5.731109 -6.443188        NA -6.460969        NA    648 DivAnc
## 4 4    18        NA -6.443188        NA        NA        NA    109 DivAnc
## 5 5    23        NA        NA -6.159156        NA        NA    421 DivAnc
## 6 6    30 -3.762626 -3.634977 -3.862490 -4.093613 -4.425554   1012 DivAnc
##   Environment
## 1      SC_3.0
## 2      SC_3.0
## 3      SC_3.0
## 4      SC_3.0
## 5      SC_3.0
## 6      SC_3.0

As you see, this table has several columns. Column “Strain” shows which strain was used in the experiment. “Environment” shows conditions in which microorganisms were propagated. Columns “Mut_ID” and “BC_ID” give information about type of mutations and barcodes that correspond to them. Columns “H_0”: “H_96” show frequenfy of each barcode over time.

Lets modify a table:

First, remove the column “X”. We are not going to use the information contained in it. For this we are going to use function “select”

# Remove column "X" from a dataframe:
exp_data <- select(exp_data, -X)

# Look at first 6 rows
head(exp_data)
##   BC_ID       H_0      H_24      H_48      H_72      H_96 Mut_ID Strain
## 1     1 -5.430079 -6.142158        NA -6.159939        NA    185 DivAnc
## 2    10 -5.731109        NA        NA        NA        NA    655 DivAnc
## 3    16 -5.731109 -6.443188        NA -6.460969        NA    648 DivAnc
## 4    18        NA -6.443188        NA        NA        NA    109 DivAnc
## 5    23        NA        NA -6.159156        NA        NA    421 DivAnc
## 6    30 -3.762626 -3.634977 -3.862490 -4.093613 -4.425554   1012 DivAnc
##   Environment
## 1      SC_3.0
## 2      SC_3.0
## 3      SC_3.0
## 4      SC_3.0
## 5      SC_3.0
## 6      SC_3.0

Lets get ready for plotting:

We are going to use ggplot2 package. Please check the syntax that is commonly used for plotting with this package. Does everything make sense?

# It is dificult to plot in a normal way with base plot
col.ind <- grep("H_", colnames(exp_data))

# Something like this coud do it but it is clunky and unelegent
plot(1:length(col.ind), exp_data[1, col.ind], typ = "b", ylim = c(0, max(exp_data[, col.ind], na.rm = T)))

for (i in 2:6) { # nrow(exp_data)) {
  points(1:length(col.ind), exp_data[i, col.ind], typ = "b")
}

To make a graph, we need two give a function 2 variables (2 columns) to plot them against each other.

What variables are we going to use?

Lets rearrange our table to be able to plot the data easily. Instead on keeping information about barcode frequency in rows, we are going to create a column “Time” with time points and a column “Frequency” with corresponding barcode frequencies.

# First, check how function "gather" works
exp_rearranged <- gather(exp_data, Generation, Frequency, H_0:H_96)

head(exp_rearranged)
##   BC_ID Mut_ID Strain Environment Generation Frequency
## 1     1    185 DivAnc      SC_3.0        H_0 -5.430079
## 2    10    655 DivAnc      SC_3.0        H_0 -5.731109
## 3    16    648 DivAnc      SC_3.0        H_0 -5.731109
## 4    18    109 DivAnc      SC_3.0        H_0        NA
## 5    23    421 DivAnc      SC_3.0        H_0        NA
## 6    30   1012 DivAnc      SC_3.0        H_0 -3.762626

You might have noticed that “Generation” column contains both “H” that stands for “hours” and numbers. Lets remove “H_” part from this column.

All because we want numbers to plot with (i.e. Time!)

Check the syntax of “separate” function.

# Separate values in "Generation" column into 2 columns
table_for_graph <- separate(exp_rearranged, Generation, into = c("H", "Time"))

head(table_for_graph)
##   BC_ID Mut_ID Strain Environment H Time Frequency
## 1     1    185 DivAnc      SC_3.0 H    0 -5.430079
## 2    10    655 DivAnc      SC_3.0 H    0 -5.731109
## 3    16    648 DivAnc      SC_3.0 H    0 -5.731109
## 4    18    109 DivAnc      SC_3.0 H    0        NA
## 5    23    421 DivAnc      SC_3.0 H    0        NA
## 6    30   1012 DivAnc      SC_3.0 H    0 -3.762626
# Remove column "H" using function "select"
table_for_graph <- select(table_for_graph, -H)

head(table_for_graph)
##   BC_ID Mut_ID Strain Environment Time Frequency
## 1     1    185 DivAnc      SC_3.0    0 -5.430079
## 2    10    655 DivAnc      SC_3.0    0 -5.731109
## 3    16    648 DivAnc      SC_3.0    0 -5.731109
## 4    18    109 DivAnc      SC_3.0    0        NA
## 5    23    421 DivAnc      SC_3.0    0        NA
## 6    30   1012 DivAnc      SC_3.0    0 -3.762626

You might have noticed that our table contains a lot of “NA” values. Go ahead and remove them with na.omit function. Don’t forget to check it’s syntax first!

table_cleaned <- na.omit(table_for_graph)
table_cleaned$Time <- as.numeric(table_cleaned$Time)

head(table_cleaned)
##   BC_ID Mut_ID Strain Environment Time Frequency
## 1     1    185 DivAnc      SC_3.0    0 -5.430079
## 2    10    655 DivAnc      SC_3.0    0 -5.731109
## 3    16    648 DivAnc      SC_3.0    0 -5.731109
## 6    30   1012 DivAnc      SC_3.0    0 -3.762626
## 7    38    333 DivAnc      SC_3.0    0 -5.430079
## 8    45     71 DivAnc      SC_3.0    0 -3.143398

Now the table is ready.

How are we going to plot all the values? Do we need to separate them in any way? If yes, then how?

# We might need to plot data for each strain separately..
DivAnc <- filter(table_cleaned, table_cleaned$Strain == "DivAnc")
L013 <- filter(table_cleaned, table_cleaned$Strain == "L013")
# make a plot for DivAnc strain
DivAnc_plot <- ggplot(DivAnc, aes(x = Time, y = Frequency, group = BC_ID) ) +
  geom_line(alpha = .2, colour = "#000033") + 
  ggtitle("DivAnc_SC3") + 
  theme(plot.title = element_text(hjust = 0.5)) + 
  xlab("Time, hours") + ylab("Log10(Barcode frequency)")

DivAnc_plot

Can do a cool interactive plotly graph that we can brush over to see tren line IDs

ggplotly(DivAnc_plot)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# make a plot for L013 strain
L013_plot <- ggplot(L013, aes(x = Time, y = Frequency, group = BC_ID)) +
  geom_line(alpha = .2, colour = "#CC6633") + 
  ggtitle("L013_SC3") + 
  theme(plot.title = element_text(hjust = 0.5)) + 
  xlab("Time, hours") + ylab("Log10(Barcode frequency)")

L013_plot

We make 2 graphs at the same time?

x <- ggplot(table_cleaned, aes(x = Time, y = Frequency, group = BC_ID)) +
  geom_line(alpha = .2, colour = "#000033") + 
  facet_grid(.~Strain) + 
  ggtitle("Barcode trajectories") + 
  theme(plot.title = element_text(hjust = 0.5)) + 
  xlab("Time, hours") + ylab("Log10(Barcode frequency)")

x

ggplotly(x)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

Lets pick one mutation and check how it behaves in different strains

# I've chosen Mut_ID==34
mut34 <- filter(table_cleaned, table_cleaned$Mut_ID == "34")
mut34
##      BC_ID Mut_ID Strain Environment Time Frequency
## 1     2034     34 DivAnc      SC_3.0    0 -4.886011
## 2     3833     34 DivAnc      SC_3.0    0 -3.662923
## 3     4886     34 DivAnc      SC_3.0    0 -3.384756
## 4     7291     34 DivAnc      SC_3.0    0 -3.080802
## 5    14408     34 DivAnc      SC_3.0    0 -5.731109
## 6    17665     34 DivAnc      SC_3.0    0 -5.731109
## 7    19725     34 DivAnc      SC_3.0    0 -5.731109
## 8    21930     34 DivAnc      SC_3.0    0 -3.271717
## 9    25361     34 DivAnc      SC_3.0    0 -2.966933
## 10   28247     34 DivAnc      SC_3.0    0 -2.658859
## 11   59536     34 DivAnc      SC_3.0    0 -2.993916
## 12   74450     34 DivAnc      SC_3.0    0 -2.842368
## 13   98890     34 DivAnc      SC_3.0    0 -2.938717
## 14  127444     34 DivAnc      SC_3.0    0 -5.430079
## 15  134101     34 DivAnc      SC_3.0    0 -3.258353
## 16  134569     34 DivAnc      SC_3.0    0 -5.430079
## 17  137077     34 DivAnc      SC_3.0    0 -3.142277
## 18  232780     34 DivAnc      SC_3.0    0 -5.731109
## 19  235317     34 DivAnc      SC_3.0    0 -5.731109
## 20  259214     34 DivAnc      SC_3.0    0 -3.097641
## 21    2034     34   L013      SC_7.0    0 -5.635196
## 22    2469     34   L013      SC_7.0    0 -5.635196
## 23    3833     34   L013      SC_7.0    0 -3.685806
## 24    4886     34   L013      SC_7.0    0 -3.407309
## 25    7291     34   L013      SC_7.0    0 -3.021354
## 26   19725     34   L013      SC_7.0    0 -5.635196
## 27   21930     34   L013      SC_7.0    0 -3.407309
## 28   25361     34   L013      SC_7.0    0 -3.160980
## 29   28247     34   L013      SC_7.0    0 -2.660224
## 30   59536     34   L013      SC_7.0    0 -3.123313
## 31   74450     34   L013      SC_7.0    0 -2.913386
## 32   98890     34   L013      SC_7.0    0 -3.262284
## 33  127444     34   L013      SC_7.0    0 -5.635196
## 34  134101     34   L013      SC_7.0    0 -3.181878
## 35  137077     34   L013      SC_7.0    0 -3.492181
## 36  235317     34   L013      SC_7.0    0 -5.635196
## 37  259214     34   L013      SC_7.0    0 -3.120648
## 38    2034     34 DivAnc      SC_3.0   24 -5.488946
## 39    2469     34 DivAnc      SC_3.0   24 -6.443188
## 40    3833     34 DivAnc      SC_3.0   24 -3.473307
## 41    4886     34 DivAnc      SC_3.0   24 -3.207912
## 42    7291     34 DivAnc      SC_3.0   24 -3.031737
## 43   17665     34 DivAnc      SC_3.0   24 -6.142158
## 44   19725     34 DivAnc      SC_3.0   24 -6.443188
## 45   21930     34 DivAnc      SC_3.0   24 -3.232870
## 46   25361     34 DivAnc      SC_3.0   24 -2.810832
## 47   28247     34 DivAnc      SC_3.0   24 -2.475687
## 48   59536     34 DivAnc      SC_3.0   24 -2.861239
## 49   74450     34 DivAnc      SC_3.0   24 -2.827133
## 50   95057     34 DivAnc      SC_3.0   24 -6.443188
## 51   98890     34 DivAnc      SC_3.0   24 -2.931171
## 52  127444     34 DivAnc      SC_3.0   24 -6.142158
## 53  127564     34 DivAnc      SC_3.0   24 -6.443188
## 54  134101     34 DivAnc      SC_3.0   24 -3.679760
## 55  134569     34 DivAnc      SC_3.0   24 -5.966067
## 56  137077     34 DivAnc      SC_3.0   24 -3.036138
## 57  155199     34 DivAnc      SC_3.0   24 -6.443188
## 58  232780     34 DivAnc      SC_3.0   24 -6.443188
## 59  259214     34 DivAnc      SC_3.0   24 -2.845274
## 60    2034     34   L013      SC_7.0   24 -5.409424
## 61    3833     34   L013      SC_7.0   24 -4.205304
## 62    4886     34   L013      SC_7.0   24 -3.469905
## 63    7291     34   L013      SC_7.0   24 -3.194580
## 64   13322     34   L013      SC_7.0   24 -5.108394
## 65   21930     34   L013      SC_7.0   24 -3.669061
## 66   25361     34   L013      SC_7.0   24 -3.603244
## 67   28247     34   L013      SC_7.0   24 -2.708720
## 68   59536     34   L013      SC_7.0   24 -3.106228
## 69   74450     34   L013      SC_7.0   24 -2.960718
## 70   98890     34   L013      SC_7.0   24 -3.368031
## 71  134101     34   L013      SC_7.0   24 -3.685148
## 72  134569     34   L013      SC_7.0   24 -5.409424
## 73  137077     34   L013      SC_7.0   24 -3.517330
## 74  155199     34   L013      SC_7.0   24 -5.409424
## 75  259214     34   L013      SC_7.0   24 -3.405103
## 76    2034     34 DivAnc      SC_3.0   48 -5.460186
## 77    2469     34 DivAnc      SC_3.0   48 -6.159156
## 78    3833     34 DivAnc      SC_3.0   48 -3.588613
## 79    4886     34 DivAnc      SC_3.0   48 -3.126134
## 80    7291     34 DivAnc      SC_3.0   48 -3.151408
## 81   13322     34 DivAnc      SC_3.0   48 -6.159156
## 82   14408     34 DivAnc      SC_3.0   48 -5.858126
## 83   17665     34 DivAnc      SC_3.0   48 -5.557096
## 84   19725     34 DivAnc      SC_3.0   48 -6.159156
## 85   21930     34 DivAnc      SC_3.0   48 -3.191140
## 86   25361     34 DivAnc      SC_3.0   48 -2.782579
## 87   28247     34 DivAnc      SC_3.0   48 -2.471003
## 88   59536     34 DivAnc      SC_3.0   48 -2.847614
## 89   74450     34 DivAnc      SC_3.0   48 -2.840883
## 90   95057     34 DivAnc      SC_3.0   48 -5.858126
## 91   98890     34 DivAnc      SC_3.0   48 -2.849100
## 92  127444     34 DivAnc      SC_3.0   48 -6.159156
## 93  127564     34 DivAnc      SC_3.0   48 -5.682034
## 94  134101     34 DivAnc      SC_3.0   48 -3.047557
## 95  134569     34 DivAnc      SC_3.0   48 -6.159156
## 96  137077     34 DivAnc      SC_3.0   48 -3.036940
## 97  155199     34 DivAnc      SC_3.0   48 -5.858126
## 98  211299     34 DivAnc      SC_3.0   48 -6.159156
## 99  259214     34 DivAnc      SC_3.0   48 -2.805624
## 100   3833     34   L013      SC_7.0   48 -4.725987
## 101   4886     34   L013      SC_7.0   48 -3.383564
## 102   7291     34   L013      SC_7.0   48 -3.315210
## 103  19418     34   L013      SC_7.0   48 -5.328047
## 104  19725     34   L013      SC_7.0   48 -5.328047
## 105  21930     34   L013      SC_7.0   48 -3.748264
## 106  25361     34   L013      SC_7.0   48 -4.005828
## 107  28247     34   L013      SC_7.0   48 -2.703765
## 108  59536     34   L013      SC_7.0   48 -3.234625
## 109  74450     34   L013      SC_7.0   48 -3.185032
## 110  98890     34   L013      SC_7.0   48 -4.097598
## 111 134101     34   L013      SC_7.0   48 -3.452986
## 112 137077     34   L013      SC_7.0   48 -3.665289
## 113 259214     34   L013      SC_7.0   48 -3.535655
## 114   2034     34 DivAnc      SC_3.0   72 -5.761999
## 115   2469     34 DivAnc      SC_3.0   72 -6.159939
## 116   3833     34 DivAnc      SC_3.0   72 -3.480057
## 117   4886     34 DivAnc      SC_3.0   72 -3.065293
## 118   7291     34 DivAnc      SC_3.0   72 -3.196623
## 119  13322     34 DivAnc      SC_3.0   72 -6.460969
## 120  14408     34 DivAnc      SC_3.0   72 -6.460969
## 121  17665     34 DivAnc      SC_3.0   72 -6.460969
## 122  19725     34 DivAnc      SC_3.0   72 -6.159939
## 123  21930     34 DivAnc      SC_3.0   72 -3.268401
## 124  25361     34 DivAnc      SC_3.0   72 -2.730833
## 125  28247     34 DivAnc      SC_3.0   72 -2.484527
## 126  52882     34 DivAnc      SC_3.0   72 -6.460969
## 127  59536     34 DivAnc      SC_3.0   72 -2.876072
## 128  74450     34 DivAnc      SC_3.0   72 -2.802767
## 129  98890     34 DivAnc      SC_3.0   72 -2.784733
## 130 127444     34 DivAnc      SC_3.0   72 -6.159939
## 131 134101     34 DivAnc      SC_3.0   72 -3.121319
## 132 134569     34 DivAnc      SC_3.0   72 -6.460969
## 133 137077     34 DivAnc      SC_3.0   72 -3.018646
## 134 155199     34 DivAnc      SC_3.0   72 -6.460969
## 135 211299     34 DivAnc      SC_3.0   72 -6.460969
## 136 226020     34 DivAnc      SC_3.0   72 -6.460969
## 137 232780     34 DivAnc      SC_3.0   72 -6.460969
## 138 235317     34 DivAnc      SC_3.0   72 -6.460969
## 139 259214     34 DivAnc      SC_3.0   72 -2.851801
## 140   2034     34   L013      SC_7.0   72 -5.874534
## 141   3833     34   L013      SC_7.0   72 -5.874534
## 142   4886     34   L013      SC_7.0   72 -3.955456
## 143   7291     34   L013      SC_7.0   72 -3.264940
## 144  17665     34   L013      SC_7.0   72 -5.874534
## 145  21930     34   L013      SC_7.0   72 -4.369384
## 146  25361     34   L013      SC_7.0   72 -3.945115
## 147  28247     34   L013      SC_7.0   72 -2.733084
## 148  59536     34   L013      SC_7.0   72 -3.258584
## 149  74450     34   L013      SC_7.0   72 -3.214618
## 150  98890     34   L013      SC_7.0   72 -3.865934
## 151 134101     34   L013      SC_7.0   72 -3.767324
## 152 134569     34   L013      SC_7.0   72 -5.874534
## 153 137077     34   L013      SC_7.0   72 -3.767324
## 154 155199     34   L013      SC_7.0   72 -5.874534
## 155 235317     34   L013      SC_7.0   72 -5.874534
## 156 259214     34   L013      SC_7.0   72 -4.272474
## 157   2034     34 DivAnc      SC_3.0   96 -5.566481
## 158   2469     34 DivAnc      SC_3.0   96 -6.344632
## 159   3833     34 DivAnc      SC_3.0   96 -3.493986
## 160   4886     34 DivAnc      SC_3.0   96 -3.112908
## 161   7291     34 DivAnc      SC_3.0   96 -3.092994
## 162  21930     34 DivAnc      SC_3.0   96 -3.468414
## 163  25361     34 DivAnc      SC_3.0   96 -2.760867
## 164  28247     34 DivAnc      SC_3.0   96 -2.472301
## 165  59536     34 DivAnc      SC_3.0   96 -2.833283
## 166  74450     34 DivAnc      SC_3.0   96 -2.820496
## 167  98890     34 DivAnc      SC_3.0   96 -2.785803
## 168 127444     34 DivAnc      SC_3.0   96 -6.043602
## 169 127564     34 DivAnc      SC_3.0   96 -6.344632
## 170 134101     34 DivAnc      SC_3.0   96 -3.255434
## 171 134569     34 DivAnc      SC_3.0   96 -5.742572
## 172 137077     34 DivAnc      SC_3.0   96 -3.019116
## 173 259214     34 DivAnc      SC_3.0   96 -2.816358
## 174   3833     34   L013      SC_7.0   96 -5.227104
## 175   4886     34   L013      SC_7.0   96 -3.963862
## 176   7291     34   L013      SC_7.0   96 -3.513893
## 177  21930     34   L013      SC_7.0   96 -5.403195
## 178  25361     34   L013      SC_7.0   96 -3.754835
## 179  28247     34   L013      SC_7.0   96 -2.870441
## 180  59536     34   L013      SC_7.0   96 -3.369771
## 181  74450     34   L013      SC_7.0   96 -3.636039
## 182  98890     34   L013      SC_7.0   96 -3.865376
## 183 115838     34   L013      SC_7.0   96 -5.704225
## 184 134101     34   L013      SC_7.0   96 -4.014029
## 185 134569     34   L013      SC_7.0   96 -5.704225
## 186 137077     34   L013      SC_7.0   96 -3.610803
## 187 235317     34   L013      SC_7.0   96 -5.704225
## 188 259214     34   L013      SC_7.0   96 -4.926074
ggplot(mut34, aes(Time, Frequency, group = BC_ID, color = BC_ID)) +
  geom_line() + 
  theme(legend.position = "none") + 
  facet_grid(.~Strain) + 
  ggtitle("Mutation_34") + 
  xlab("Time, hours") + ylab("Log10(Barcode frequency)") +
  theme(plot.title = element_text(hjust = 0.5))

What can you tell looking at this plot? Why do we have 2 clusters of barcodes?

Lets filter out barcodes with frequency > (-5) and use them for subsequent analysis.

mut34_f <- filter(mut34, mut34$Frequency > (-5))
mut34_f
##      BC_ID Mut_ID Strain Environment Time Frequency
## 1     2034     34 DivAnc      SC_3.0    0 -4.886011
## 2     3833     34 DivAnc      SC_3.0    0 -3.662923
## 3     4886     34 DivAnc      SC_3.0    0 -3.384756
## 4     7291     34 DivAnc      SC_3.0    0 -3.080802
## 5    21930     34 DivAnc      SC_3.0    0 -3.271717
## 6    25361     34 DivAnc      SC_3.0    0 -2.966933
## 7    28247     34 DivAnc      SC_3.0    0 -2.658859
## 8    59536     34 DivAnc      SC_3.0    0 -2.993916
## 9    74450     34 DivAnc      SC_3.0    0 -2.842368
## 10   98890     34 DivAnc      SC_3.0    0 -2.938717
## 11  134101     34 DivAnc      SC_3.0    0 -3.258353
## 12  137077     34 DivAnc      SC_3.0    0 -3.142277
## 13  259214     34 DivAnc      SC_3.0    0 -3.097641
## 14    3833     34   L013      SC_7.0    0 -3.685806
## 15    4886     34   L013      SC_7.0    0 -3.407309
## 16    7291     34   L013      SC_7.0    0 -3.021354
## 17   21930     34   L013      SC_7.0    0 -3.407309
## 18   25361     34   L013      SC_7.0    0 -3.160980
## 19   28247     34   L013      SC_7.0    0 -2.660224
## 20   59536     34   L013      SC_7.0    0 -3.123313
## 21   74450     34   L013      SC_7.0    0 -2.913386
## 22   98890     34   L013      SC_7.0    0 -3.262284
## 23  134101     34   L013      SC_7.0    0 -3.181878
## 24  137077     34   L013      SC_7.0    0 -3.492181
## 25  259214     34   L013      SC_7.0    0 -3.120648
## 26    3833     34 DivAnc      SC_3.0   24 -3.473307
## 27    4886     34 DivAnc      SC_3.0   24 -3.207912
## 28    7291     34 DivAnc      SC_3.0   24 -3.031737
## 29   21930     34 DivAnc      SC_3.0   24 -3.232870
## 30   25361     34 DivAnc      SC_3.0   24 -2.810832
## 31   28247     34 DivAnc      SC_3.0   24 -2.475687
## 32   59536     34 DivAnc      SC_3.0   24 -2.861239
## 33   74450     34 DivAnc      SC_3.0   24 -2.827133
## 34   98890     34 DivAnc      SC_3.0   24 -2.931171
## 35  134101     34 DivAnc      SC_3.0   24 -3.679760
## 36  137077     34 DivAnc      SC_3.0   24 -3.036138
## 37  259214     34 DivAnc      SC_3.0   24 -2.845274
## 38    3833     34   L013      SC_7.0   24 -4.205304
## 39    4886     34   L013      SC_7.0   24 -3.469905
## 40    7291     34   L013      SC_7.0   24 -3.194580
## 41   21930     34   L013      SC_7.0   24 -3.669061
## 42   25361     34   L013      SC_7.0   24 -3.603244
## 43   28247     34   L013      SC_7.0   24 -2.708720
## 44   59536     34   L013      SC_7.0   24 -3.106228
## 45   74450     34   L013      SC_7.0   24 -2.960718
## 46   98890     34   L013      SC_7.0   24 -3.368031
## 47  134101     34   L013      SC_7.0   24 -3.685148
## 48  137077     34   L013      SC_7.0   24 -3.517330
## 49  259214     34   L013      SC_7.0   24 -3.405103
## 50    3833     34 DivAnc      SC_3.0   48 -3.588613
## 51    4886     34 DivAnc      SC_3.0   48 -3.126134
## 52    7291     34 DivAnc      SC_3.0   48 -3.151408
## 53   21930     34 DivAnc      SC_3.0   48 -3.191140
## 54   25361     34 DivAnc      SC_3.0   48 -2.782579
## 55   28247     34 DivAnc      SC_3.0   48 -2.471003
## 56   59536     34 DivAnc      SC_3.0   48 -2.847614
## 57   74450     34 DivAnc      SC_3.0   48 -2.840883
## 58   98890     34 DivAnc      SC_3.0   48 -2.849100
## 59  134101     34 DivAnc      SC_3.0   48 -3.047557
## 60  137077     34 DivAnc      SC_3.0   48 -3.036940
## 61  259214     34 DivAnc      SC_3.0   48 -2.805624
## 62    3833     34   L013      SC_7.0   48 -4.725987
## 63    4886     34   L013      SC_7.0   48 -3.383564
## 64    7291     34   L013      SC_7.0   48 -3.315210
## 65   21930     34   L013      SC_7.0   48 -3.748264
## 66   25361     34   L013      SC_7.0   48 -4.005828
## 67   28247     34   L013      SC_7.0   48 -2.703765
## 68   59536     34   L013      SC_7.0   48 -3.234625
## 69   74450     34   L013      SC_7.0   48 -3.185032
## 70   98890     34   L013      SC_7.0   48 -4.097598
## 71  134101     34   L013      SC_7.0   48 -3.452986
## 72  137077     34   L013      SC_7.0   48 -3.665289
## 73  259214     34   L013      SC_7.0   48 -3.535655
## 74    3833     34 DivAnc      SC_3.0   72 -3.480057
## 75    4886     34 DivAnc      SC_3.0   72 -3.065293
## 76    7291     34 DivAnc      SC_3.0   72 -3.196623
## 77   21930     34 DivAnc      SC_3.0   72 -3.268401
## 78   25361     34 DivAnc      SC_3.0   72 -2.730833
## 79   28247     34 DivAnc      SC_3.0   72 -2.484527
## 80   59536     34 DivAnc      SC_3.0   72 -2.876072
## 81   74450     34 DivAnc      SC_3.0   72 -2.802767
## 82   98890     34 DivAnc      SC_3.0   72 -2.784733
## 83  134101     34 DivAnc      SC_3.0   72 -3.121319
## 84  137077     34 DivAnc      SC_3.0   72 -3.018646
## 85  259214     34 DivAnc      SC_3.0   72 -2.851801
## 86    4886     34   L013      SC_7.0   72 -3.955456
## 87    7291     34   L013      SC_7.0   72 -3.264940
## 88   21930     34   L013      SC_7.0   72 -4.369384
## 89   25361     34   L013      SC_7.0   72 -3.945115
## 90   28247     34   L013      SC_7.0   72 -2.733084
## 91   59536     34   L013      SC_7.0   72 -3.258584
## 92   74450     34   L013      SC_7.0   72 -3.214618
## 93   98890     34   L013      SC_7.0   72 -3.865934
## 94  134101     34   L013      SC_7.0   72 -3.767324
## 95  137077     34   L013      SC_7.0   72 -3.767324
## 96  259214     34   L013      SC_7.0   72 -4.272474
## 97    3833     34 DivAnc      SC_3.0   96 -3.493986
## 98    4886     34 DivAnc      SC_3.0   96 -3.112908
## 99    7291     34 DivAnc      SC_3.0   96 -3.092994
## 100  21930     34 DivAnc      SC_3.0   96 -3.468414
## 101  25361     34 DivAnc      SC_3.0   96 -2.760867
## 102  28247     34 DivAnc      SC_3.0   96 -2.472301
## 103  59536     34 DivAnc      SC_3.0   96 -2.833283
## 104  74450     34 DivAnc      SC_3.0   96 -2.820496
## 105  98890     34 DivAnc      SC_3.0   96 -2.785803
## 106 134101     34 DivAnc      SC_3.0   96 -3.255434
## 107 137077     34 DivAnc      SC_3.0   96 -3.019116
## 108 259214     34 DivAnc      SC_3.0   96 -2.816358
## 109   4886     34   L013      SC_7.0   96 -3.963862
## 110   7291     34   L013      SC_7.0   96 -3.513893
## 111  25361     34   L013      SC_7.0   96 -3.754835
## 112  28247     34   L013      SC_7.0   96 -2.870441
## 113  59536     34   L013      SC_7.0   96 -3.369771
## 114  74450     34   L013      SC_7.0   96 -3.636039
## 115  98890     34   L013      SC_7.0   96 -3.865376
## 116 134101     34   L013      SC_7.0   96 -4.014029
## 117 137077     34   L013      SC_7.0   96 -3.610803
## 118 259214     34   L013      SC_7.0   96 -4.926074

Plot again the same type of graph, but use filtered data. Make sure that you have done everything right.

ggplot(mut34_f, aes(Time, Frequency, group = BC_ID, color = BC_ID)) + 
  geom_line() + 
  theme(legend.position = "none") + 
  facet_grid(.~Strain) + 
  ggtitle("Mutation_34") + 
  xlab("Time, hours") + ylab("Log10(Barcode frequency)") +
  theme(plot.title = element_text(hjust = 0.5))

Most of the first plot shows parelle effects (good, so we can just average)

But in the second some things are different…

Lets focus again on mutation 34 and

ggplot(mut34_f, aes(Time, Frequency, colour = BC_ID, group = BC_ID)) + 
  geom_point() + 
  geom_smooth(se = FALSE, method = "lm") + 
  facet_grid(.~Strain) + 
  theme(legend.position = "none") + 
  ggtitle(paste("Mutation", 34, sep = "_")) + 
  xlab("Time, hours") + ylab("Log10(Barcode frequency)")

Now you chan choose a different mutation and check how it behaves across strains.

Now it’s time to estimate slope for each barcode. Lets create a file that will contain information about BC_ID, Mut_ID, Strain, and estimated slope.

# Lets become familiar with lm function:

# For this exercise, take the filtered data for mutation 34 (mut34_f) and filter out information about one barcode you like

# I have chosen BC_ID=25361 in DivAnc strain
BC_25361 <- filter(mut34_f, mut34_f$BC_ID == "25361", mut34_f$Strain == "DivAnc")

BC_25361
##   BC_ID Mut_ID Strain Environment Time Frequency
## 1 25361     34 DivAnc      SC_3.0    0 -2.966933
## 2 25361     34 DivAnc      SC_3.0   24 -2.810832
## 3 25361     34 DivAnc      SC_3.0   48 -2.782579
## 4 25361     34 DivAnc      SC_3.0   72 -2.730833
## 5 25361     34 DivAnc      SC_3.0   96 -2.760867
# Lets plot frequency of this barcode:
ggplot(BC_25361, aes(Time, Frequency, colour = BC_ID)) +
  geom_point() + 
  theme(legend.position = "none") + 
  ggtitle("BC_25361") + 
  xlab("Time, hours") + ylab("Log10(Frequency)")

# Lets use lm function to fit the line to these points:
ggplot(BC_25361, aes(Time, Frequency, colour = BC_ID)) +
  geom_point() + 
  geom_smooth(se = FALSE, method = "lm") + 
  theme(legend.position = "none") + 
  ggtitle("BC_25361") +
  xlab("Time, hours") + ylab("Log10(Frequency)")

# Lets check what data does lm function return:
regression_model <- lm(Frequency~Time, BC_25361)
summary_data <- summary(regression_model)
summary_data
## 
## Call:
## lm(formula = Frequency ~ Time, data = BC_25361)
## 
## Residuals:
##        1        2        3        4        5 
## -0.05810  0.04879  0.02783  0.03036 -0.04888 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.9088351  0.0443664 -65.564 7.82e-06 ***
## Time         0.0020506  0.0007547   2.717   0.0727 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05728 on 3 degrees of freedom
## Multiple R-squared:  0.7111, Adjusted R-squared:  0.6147 
## F-statistic: 7.383 on 1 and 3 DF,  p-value: 0.07273
# The information we are interested in is the value of Slopeand Intercept of this line:
# Let's try to access them:

# Time
Time <- summary_data$coefficients[2]
Time
## [1] 0.002050551
# Intercept:
Intercept <- summary_data$coefficients[1]
Intercept
## [1] -2.908835

Now we can find slopes for each barcode for each mutation in all strains.

# Lets create the file:
data_header <- matrix(data = NA, nrow = 1, ncol = 7)
data_header[1] <- "Mut_ID"
data_header[2] <- "BC_ID"
data_header[3] <- "Strain"
data_header[4] <- "Slope"
data_header[5] <- "Intercept"
data_header[6] <- "R^2"
write.table(data_header, "Tnseq_practice_output.csv", append = FALSE, sep = ",", eol = "\n", dec = ".", row.names = FALSE, col.names = FALSE)
for (mut in unique(table_cleaned$Mut_ID)) {
  mut_data <- filter(table_cleaned, table_cleaned$Mut_ID == paste(mut))
  # now we have all data for each mutation separately
  for (bc in unique(mut_data$BC_ID)) {
    # now we filtered data for each barcode within 1 mutation
    bc_mut_data <- filter(mut_data, mut_data$BC_ID == paste(bc))
    for (strain in unique(bc_mut_data$Strain)) {
      str_bc_mut_data <- filter(bc_mut_data, bc_mut_data$Strain == paste(strain))
      # only considering combinations with 3 or more data points - anything less is statistically insignificant
      if (nrow(str_bc_mut_data) > 2) {
        regression_model <- lm(Frequency~Time, str_bc_mut_data)
        summary_data <- summary(regression_model)
        # now write to the output file! Prepare the data array first
        data_output <- matrix(data = NA, nrow = 1, ncol = 6)
        data_output[1] <- mut
        data_output[2] <- bc
        data_output[3] <- strain
        # slope
        data_output[4] <- summary_data$coefficients[2]
        # intercept
        data_output[5] <- summary_data$coefficients[1]
        # r-squared
        data_output[6] <- summary_data$r.squared
        # time to write
        write.table(data_output, "Tnseq_practice_output.csv", append = TRUE, sep = ",", eol = "\n", dec = ".", row.names = FALSE, col.names = FALSE)
      }
    }
  }
}